#pip install TextBlob
import sys
import csv
import nltk
import time
import re
import tqdm
import spacy
import string
import numpy as np
import pandas as pd
from collections import Counter
# Gensim
import gensim
import warnings
from gensim import models
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
warnings.filterwarnings('ignore')
from textblob import TextBlob
# Import sklearn evaluation metrics
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# Import machine learning models
import xgboost as xgb
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# Import sklearn label processing library
from sklearn import preprocessing
# Import sklearn train-test split function
from sklearn.model_selection import train_test_split
# Import feature extraction tfidf vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# first step is to import and install if required, the Python packages.
# nltk libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer , wordnet
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
stop_words = stopwords.words('english')
# sklearn libraries
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
[nltk_data] Downloading package wordnet to [nltk_data] C:\Users\Fine\AppData\Roaming\nltk_data... [nltk_data] Package wordnet is already up-to-date! [nltk_data] Downloading package punkt to [nltk_data] C:\Users\Fine\AppData\Roaming\nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package stopwords to [nltk_data] C:\Users\Fine\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date!
# Load the raw conversation file; each line becomes one DataFrame row.
# NOTE(review): no encoding specified — assumes platform default; confirm UTF-8.
with open("chatgpt_text_conversation.txt") as f:
    data = f.readlines()
data = pd.DataFrame(data)
data.columns = ['conversation']
# Strip wrapping commas/quotes and the trailing newline left by readlines().
data['conversation'] = data['conversation'].str.strip(',"\n')
data.head()
| conversation | |
|---|---|
| 0 | AI's capabilities are astounding, it's transfo... |
| 1 | Autonomous cars scare me, I don't trust machin... |
| 2 | AI algorithms predicting customer behavior are... |
| 3 | Concerned about AI replacing jobs, it could le... |
| 4 | AI's ability to analyze big data and derive in... |
# Split each conversation into a list of whitespace-delimited words
data['words'] = data['conversation'].str.split()
# Count the number of words in each conversation
data['word_count'] = data['words'].apply(len)
# Sum the per-row word counts to get the corpus-wide total
total_words = data['word_count'].sum()
print('Total words:', total_words)
Total words: 1504
data['conversation'][0]
"AI's capabilities are astounding, it's transforming industries!"
data.isna().sum()
conversation 0 words 0 word_count 0 dtype: int64
data.dropna(inplace=True)
data.shape
(124, 3)
def text_clean(text):
    """Normalize a raw conversation string.

    Replaces punctuation with spaces, removes digits and single
    characters, lowercases, and collapses repeated whitespace.

    Args:
        text: Raw input string.

    Returns:
        Cleaned, lowercased string with single spaces.
    """
    # Replace every punctuation character with a space in one C-level pass.
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    text = text.translate(translator)
    # remove numeric values
    text = re.sub(r'[0-9]', ' ', text)
    # remove single characters (including some accented letters)
    text = re.sub(r"\b[A-Za-zÀÃ-ž]\b", ' ', text)
    # Lowercase and collapse whitespace. Raw string fixes the invalid
    # '\s' escape that warns (SyntaxWarning) on Python 3.12+.
    text = re.sub(r'\s+', ' ', text).lower().strip()
    return text
def text_tokenize(text):
    """Drop stop words from *text*; return the remaining tokens joined by spaces."""
    kept = (tok for tok in text.split() if tok not in stop_words)
    return " ".join(kept)
To analyze the dataset, we first explored it for null values and irrelevant attributes. We did not find any null instances, but the dataset contains a lot of punctuation and similar noise. To improve interpretability and reliability and to reduce computational cost, we removed all punctuation marks, numeric values, and special characters, lowercased the text, and collapsed repeated spaces to obtain homogenized data.
In the next step, we tokenized all text using the NLTK standard tokenizer and removed stop words (e.g. "the", "is"), as these words do not carry topic-specific information.
This step is also part of the text-processing pipeline that performs lemmatization, a process that converts words to their base or root form, known as a "lemma". Lemmatization reduces the dimensionality of the data and consolidates different forms of a word into a single entity for better text analysis, e.g. running --> run.
# Loading the spacy model
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
def lemmatize_text(text):
    """Lemmatize *text* with spaCy and drop very short tokens.

    Args:
        text: Pre-cleaned input string.

    Returns:
        Space-joined lemmas of length >= 3, single-character words and
        extra whitespace removed.
    """
    # Run the globally loaded spaCy pipeline (parser/NER disabled at load time).
    doc = nlp(text)
    # Keep only lemmas that are at least three characters long.
    lemmas = [token.lemma_ for token in doc]
    text = " ".join(lemma for lemma in lemmas if len(lemma) >= 3)
    # Remove any remaining single-character words.
    text = re.sub(r"\b[A-Za-zÀÃ-ž]\b", ' ', text)
    # Collapse whitespace. Raw string fixes the invalid '\s' escape that
    # warns (SyntaxWarning) on Python 3.12+.
    text = re.sub(r'\s+', ' ', text).strip()
    return text
# Preprocessing pipeline, applied in order: clean -> stop-word removal -> lemmatize.
data['text'] = data.conversation.apply(text_clean)
# remove stop words from the cleaned text
data['text'] = data.text.apply(text_tokenize)
# lemmatize the filtered text
data['text'] = data.text.apply(lemmatize_text)
# Word cloud over the entire processed corpus.
from wordcloud import WordCloud
plt.figure(figsize=(10,15))
# Join the different processed texts into one long string.
long_string = ','.join(list(data['text'].values))
# Create a WordCloud object with default settings
wordcloud = WordCloud()
# Generate a word cloud from the concatenated corpus
cloud = wordcloud.generate(long_string)
# Visualize the word cloud
plt.imshow(cloud)
<matplotlib.image.AxesImage at 0x19c4544f340>
# NOTE(review): str.len() counts CHARACTERS, not words/tokens — the name
# 'word_len' and the original comment are misleading; confirm intent.
data['word_len'] = data['text'].str.len()
# find and display maximum length of text
print("Maximum text length: ", data['word_len'].max())
# find and display minimum length of text
print("Minimum text length: ", data['word_len'].min())
# find and display average length of text
print("Average text length: ", data['word_len'].mean())
Maximum text length: 97 Minimum text length: 31 Average text length: 59.88709677419355
# Build the gensim dictionary and bag-of-words corpus for topic modeling.
texts = data['text'].str.split(" ")
dictionary = corpora.Dictionary(texts)
# NOTE(review): no_below=1 / no_above=1 keeps every token — this does NOT
# match the original comment about filtering below frequency 3; confirm.
dictionary.filter_extremes(no_below=1, no_above=1)
# Create a Bag of Words (BOW) representation of the corpus
corpus_bow = [dictionary.doc2bow(text) for text in texts]
# Count the number of unique tokens in the dictionary
len(dictionary)
536
# Print out the first four documents in the bow representation
for doc in corpus_bow[:4]:
print(doc)
[(0, 1), (1, 1), (2, 1), (3, 1)] [(4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)] [(10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1)] [(17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1)]
# Train a 5-topic LDA model on the BOW corpus (fixed seed for reproducibility).
lda_model = models.LdaMulticore(corpus=corpus_bow,
                                id2word=dictionary,
                                num_topics=5,
                                random_state=100,
                                chunksize=5,
                                passes=1000,
                                minimum_probability=0)
# Applies the trained LDA model to corpus_bow, yielding per-document
# topic-probability tuples.
corpus_lda = lda_model[corpus_bow]
# Score topic quality with the 'c_v' coherence metric.
coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
print("Coherence score of LDA model:", coherence_model_lda.get_coherence())
Coherence score of LDA model: 0.5269021784344105
# Draw one word cloud per LDA topic on a 2-column subplot grid.
topics = lda_model.show_topics(formatted=False,num_words = 20)
# for an odd topic count, round the row count up
if len(topics)%2 == 1:
    sub_plts = int(len(topics)/2)
    sub_plts = sub_plts+1
# for even number of topics
else:
    sub_plts = int(len(topics)/2)
fig, axes = plt.subplots(sub_plts, 2, figsize=(5,5), sharex=True, sharey=True)
# drop the unused last axis when the topic count is odd
if len(topics)%2 == 1:
    axes_p = axes.flatten()[:-1]
# for even number of topics
else:
    axes_p = axes.flatten()
# iterate over topics, one subplot each
for i, ax in enumerate(axes_p):
    fig.add_subplot(ax)
    # topics[i][1] is a list of (word, weight) pairs
    topic_words = dict(topics[i][1])
    # generate wordcloud from the topic's word weights
    cloud = WordCloud(background_color='white')
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    plt.gca().imshow(cloud)
    plt.gca().set_title('Topic ' + str(i+1), fontdict=dict(size=16))
    plt.gca().axis('off')
plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
#plt.pad=0
plt.show()
keywords = {}
# Manually assigned human-readable names for the 5 discovered topics.
topics_list = ['Blockchain', 'Internet of Things', 'Artificial Intelligence', 'Cloud Computing', 'Virtual Reality']
for idx, topic in lda_model.print_topics(-1, num_words=20):
    # Extract the quoted words from gensim's 'weight*"word"' topic string.
    pattern =r'"([A-Za-z0-9_\./\\-]*)"'
    matches = re.findall(pattern, topic)
    keyword = [i for i in matches]
    print (topics_list[idx],"::\t ", ", ".join(keyword),"\n")
    # keyed 1-based by topic number
    keywords[idx+1] = ", ".join(keyword)
Blockchain :: high, cloud, concern, need, security, offer, cost, ibm, provide, blockchain, video, service, could, strong, aws, get, adoption, doorbell, ring, bit Internet of Things :: lack, friendly, feature, sometimes, complex, user, device, improvement, setup, vive, azure, affordable, microsoft, samsung, great, seamless, human, require, support, graphic Artificial Intelligence :: bitcoin, fast, make, transfer, transaction, amazon, capability, interface, international, ripple, intuitive, provide, slow, alexa, echo, blockchain, litecoin, technology, great, nature Cloud Computing :: google, cloud, make, impressive, customer, changer, game, accurate, facial, recognition, easy, revolutionize, learn, machine, help, nest, process, crm, salesforce, data Virtual Reality :: smart, contract, experience, world, offer, datum, life, google, battery, ethereum, playstation, track, feel, game, apple, home, nest, mind, still, real
# Collect the per-document topic-probability vectors for the whole corpus.
# Initialize an empty list to store the topic probabilities for each document
doc_topic_probs = []
# Iterate over each document's topic distribution in the LDA model
for m in lda_model[corpus_bow]:
    # m is a list of (topic_id, probability) pairs; keep only the probabilities
    doc_topic_probs.append([x[1] for x in m])
# doc_topic_probs now holds one row of topic probabilities per document;
# wrap it in a DataFrame (rows = documents, columns = topics).
topics_weights = pd.DataFrame(doc_topic_probs)
# Define a function to generate column names based on the number of columns
def generate_column_names(num_columns):
    """Return 1-based topic column labels: ['Topic_1', ..., 'Topic_<num_columns>']."""
    return [f"Topic_{idx}" for idx in range(1, num_columns + 1)]
# Generate column names based on the length of 'topics_weights' columns
column_names = generate_column_names(len(topics_weights.columns))
# Assign the generated column names to the columns of 'topics_weights' DataFrame
topics_weights.columns = column_names
# Display the 'topics_weights' DataFrame
topics_weights
| Topic_1 | Topic_2 | Topic_3 | Topic_4 | Topic_5 | |
|---|---|---|---|---|---|
| 0 | 0.040009 | 0.040007 | 0.839966 | 0.040010 | 0.040008 |
| 1 | 0.028582 | 0.028580 | 0.028738 | 0.170202 | 0.743898 |
| 2 | 0.025004 | 0.025003 | 0.025004 | 0.899985 | 0.025004 |
| 3 | 0.316436 | 0.597821 | 0.028581 | 0.028582 | 0.028580 |
| 4 | 0.022227 | 0.022226 | 0.022227 | 0.022227 | 0.911094 |
| ... | ... | ... | ... | ... | ... |
| 119 | 0.124059 | 0.018184 | 0.731771 | 0.018339 | 0.107646 |
| 120 | 0.022338 | 0.253461 | 0.022253 | 0.022226 | 0.679722 |
| 121 | 0.022224 | 0.022224 | 0.911103 | 0.022224 | 0.022224 |
| 122 | 0.022225 | 0.022257 | 0.801055 | 0.132238 | 0.022225 |
| 123 | 0.197801 | 0.018208 | 0.018345 | 0.018185 | 0.747461 |
124 rows × 5 columns
# Human-readable, 1-based document names.
data["doc_name"] = data.index.to_series().apply(lambda x: f"doc_{x+1}")
# Find the dominant topic in each document
list_of_dict = []
# Get topic distributions for every document, keeping zero-probability topics
doc_topics = lda_model.get_document_topics(corpus_bow, minimum_probability=0.0)
print('document topics: ', doc_topics)
# Iterate through the document topics
for each_doc in doc_topics:
    # Sort the topics in descending order of probability
    topics = sorted(each_doc, key=lambda x: x[1], reverse=True)
    # Initialize an empty dictionary to store topic information
    my_dict = {}
    # Iterate through the sorted topics and their probabilities
    for top, (topic_num, topic_prob) in enumerate(topics):
        if top == 0:  # the first (highest-probability) entry is the dominant topic
            my_dict["Topic Number"] = topic_num+1
            my_dict['Topic Name'] = topics_list[topic_num]
    # Append this document's dominant-topic record
    list_of_dict.append(my_dict)
topics_name = pd.DataFrame(list_of_dict)
data['topic'] = topics_name['Topic Name']
topics_name
document topics: <gensim.interfaces.TransformedCorpus object at 0x0000019C46834D60>
| Topic Number | Topic Name | |
|---|---|---|
| 0 | 3 | Artificial Intelligence |
| 1 | 5 | Virtual Reality |
| 2 | 4 | Cloud Computing |
| 3 | 2 | Internet of Things |
| 4 | 5 | Virtual Reality |
| ... | ... | ... |
| 119 | 3 | Artificial Intelligence |
| 120 | 5 | Virtual Reality |
| 121 | 3 | Artificial Intelligence |
| 122 | 3 | Artificial Intelligence |
| 123 | 5 | Virtual Reality |
124 rows × 2 columns
np.unique(topics_name['Topic Name'], return_counts=True)
(array(['Artificial Intelligence', 'Blockchain', 'Cloud Computing',
'Internet of Things', 'Virtual Reality'], dtype=object),
array([26, 23, 19, 30, 26], dtype=int64))
from sklearn.decomposition import PCA
topics_weights = np.array(topics_weights)
components_count= 3
# Reduce the 5-dim topic space to 3 components for 3D plotting.
# NOTE(review): PCA is fit on the TRANSPOSED matrix and components_ are
# used as per-document coordinates — unconventional; confirm intent.
lda_pca = PCA(n_components=components_count).fit(topics_weights.transpose())
lda_pca = pd.DataFrame(lda_pca.components_)
lda_pca = lda_pca.transpose()
lda_pca.columns = ['x','y','z']
# max per-document topic probability, cubed-scaled for marker sizes
max_prob = topics_weights.max(axis=1)
size = (1+np.array(max_prob)*20)**3
import plotly.graph_objects as go
# Combine the 3 PCA coordinates with each document's dominant topic number.
df = pd.concat([lda_pca, topics_name['Topic Number']], axis=1)
# 3D scatter plot: one trace per topic so the legend can toggle them.
fig = go.Figure()
for topic in df['Topic Number'].unique():
    fig.add_trace(go.Scatter3d(
        x = df[df['Topic Number'] == topic]['x'],
        y = df[df['Topic Number'] == topic]['y'],
        z = df[df['Topic Number'] == topic]['z'],
        mode = 'markers',
        marker = dict(
            size = 5,
            opacity = 0.8
        ),
        name = f'Topic {topic}'
    ))
# Axis titles and chart title
fig.update_layout(scene = dict(
    xaxis_title='Component 1',
    yaxis_title='Component 2',
    zaxis_title='Component 3'),
    margin=dict(r=20, b=10, l=10, t=35),
    title='Topic Model 3D scatter plot')
fig.show()
# Aggregate corpus-wide term frequencies from the BOW corpus.
# Counter replaces the original hand-rolled if/else accumulation; missing
# keys default to 0, and Counter is a dict subclass so downstream
# sorted(...items()) is unchanged.
word_bow = Counter()
for doc in corpus_bow:
    for word, bow_score in doc:
        word_bow[word] += bow_score
# Sort (token_id, count) pairs by count, descending.
top_words = sorted(word_bow.items(), key=lambda x: x[1], reverse=True)
print("Top 20 most frequent words bow")
print("================================\n")
# Print the top 20 words, resolving token ids back to strings.
top_words_dict = {}
for word, score in top_words[:20]:
    print(f'{dictionary[word]}: {np.round(score,3)}')
    top_words_dict[dictionary[word]] = np.round(score,3)
Top 20 most frequent words bow ================================ cloud: 16 offer: 13 google: 12 make: 11 smart: 11 high: 10 concern: 9 lack: 9 need: 8 game: 7 blockchain: 7 provide: 7 experience: 6 cost: 6 security: 6 life: 6 sometimes: 6 friendly: 6 bitcoin: 6 contract: 6
# Bar chart of the 20 most frequent tokens.
plt.figure(figsize=(10, 6))
# rotate x labels vertically so long words stay readable
plt.xticks(rotation=90)
# words on x axis, their corpus frequency on y axis
ax = sns.barplot(x=list(top_words_dict.keys()), y= list(top_words_dict.values()))
# set x and y axis labels
ax.set(xlabel='Words', ylabel='Frequency')
[Text(0.5, 0, 'Words'), Text(0, 0.5, 'Frequency')]
def getPolarity(text):
    """Classify *text* as 'Positive' (TextBlob polarity >= 0) or 'Negative'."""
    polarity = TextBlob(text).sentiment.polarity
    return "Positive" if polarity >= 0 else "Negative"
# Label each row, then build a word cloud of the NEGATIVE conversations.
data['sentiment'] = data.text.apply(getPolarity)
neg_data = data.loc[data['sentiment']=="Negative"]
plt.figure(figsize=(5,8))
# Join the negative processed texts together.
long_string = ','.join(list(neg_data['text'].values))
# Create a WordCloud object
wordcloud = WordCloud()
# Generate a word cloud
cloud = wordcloud.generate(long_string)
# Visualize the word cloud
plt.imshow(cloud)
<matplotlib.image.AxesImage at 0x19c4c394850>
# Word cloud of the POSITIVE conversations.
pos_data = data.loc[data['sentiment']=="Positive"]
plt.figure(figsize=(5,8))
# Join the positive processed texts together.
# Bug fix: this previously joined neg_data, so the "positive" word cloud
# actually displayed the negative conversations.
long_string = ','.join(list(pos_data['text'].values))
# Create a WordCloud object
wordcloud = WordCloud()
# Generate a word cloud
cloud = wordcloud.generate(long_string)
# Visualize the word cloud
plt.imshow(cloud)
<matplotlib.image.AxesImage at 0x19c4c345ee0>
# Stacked bar chart of sentiment counts per topic, with a dropdown to
# select which topic's traces are visible.
topics = data['topic'].unique()
sentiments = data['sentiment'].unique()
fig = go.Figure()
topic_buttons = []
sentiment_buttons = []
for i, topic in enumerate(topics):
    for j, sentiment in enumerate(sentiments):
        # rows matching this (topic, sentiment) pair
        filtered_df = data[(data['topic'] == topic) & (data['sentiment'] == sentiment)]
        fig.add_trace(
            go.Bar(
                x=[sentiment],
                y=[len(filtered_df)],
                name=f'{topic}-{sentiment}',
                # only the first topic's traces start fully visible
                visible=True if i == 0 else 'legendonly'
            )
        )
    # visibility mask: show only this topic's traces when its button is clicked
    visible = [False]*len(topics)*len(sentiments)
    visible[i*len(sentiments):(i+1)*len(sentiments)] = [True]*len(sentiments)
    topic_buttons.append(
        dict(
            label=topic,
            method='update',
            args=[{'visible': visible}]
        )
    )
fig.update_layout(
    updatemenus=[
        dict(
            type='dropdown',
            direction='down',
            x=1,
            y=1,
            buttons=topic_buttons
        )
    ],
    barmode='stack'
)
fig.show()
print("Number of positive sentiments:",pos_data.shape[0])
print("Number of negative sentiments:",neg_data.shape[0])
Number of positive sentiments: 105 Number of negative sentiments: 19
# Count sentiment occurrences
sentiment_counts = data['sentiment'].value_counts()
# Pie chart of the overall positive/negative split (percentages shown).
sentiment_counts.plot(kind='pie', autopct='%1.1f%%', figsize=(10, 5))
plt.title('Overall Sentiment Distribution')
plt.show()
import plotly.express as px
fig = px.box(data, x="sentiment", y="topic", color='sentiment')
fig.show()
def pos_tagging(text):
    """Return (token, POS-tag) pairs for *text* using the global spaCy pipeline."""
    tagged = []
    for token in nlp(text):
        tagged.append((token.text, token.pos_))
    return tagged
# Apply POS tagging to the 'text' column of the dataframe
data['POS_tags'] = data['text'].apply(pos_tagging)
import plotly.express as px
# Flatten the per-document (token, tag) pairs into one list of tags.
all_tags = [tag for tags in data['POS_tags'] for _, tag in tags]
# Count each POS tag
tag_counts = pd.Series(all_tags).value_counts()
# Pie chart of the corpus-wide POS tag distribution
fig = px.pie(tag_counts, values=tag_counts.values, names=tag_counts.index, title='POS Tag Distribution')
fig.show()
nlp = spacy.load("en_core_web_sm")
def named_entities(text):
    """Return the entity labels (not the spans) spaCy finds in *text*."""
    parsed = nlp(text)
    return [entity.label_ for entity in parsed.ents]
# Apply NER to the 'text' column of the dataframe
data['NER_tags'] = data['text'].apply(named_entities)
# Count the occurrences of each named-entity label across all documents
ner_counts = Counter([ner for ners in data['NER_tags'] for ner in ners])
# Prepare data for plotting.
# NOTE(review): zip(*...) raises ValueError if no entities were found at all.
labels, values = zip(*ner_counts.items())
# Plot the results
plt.figure(figsize=(10,5))
plt.bar(labels, values)
plt.xlabel('Named Entity Types')
plt.ylabel('Count')
plt.title('Named Entity Recognition')
plt.xticks(rotation=90)
plt.show()
def chunk_text(text):
    """Chunk *text* into noun phrases (optional DT, any JJs, one NN) and return the parse tree as a string."""
    grammar = """
    NP: {<DT>?<JJ>*<NN>} # Chunk sequences of DT, JJ, and NN
    """
    tagged = nltk.pos_tag(nltk.word_tokenize(text))
    parser = nltk.RegexpParser(grammar)
    return str(parser.parse(tagged))
data['parsed_tree'] = data['text'].apply(chunk_text)
data['parsed_tree']
0 (S\n (NP capability/NN)\n astounding/VBG\n ...
1 (S\n (NP autonomous/JJ car/NN)\n (NP scare/N...
2 (S\n (NP algorithm/NN)\n predict/VBP\n (NP ...
3 (S\n (NP concern/NN)\n replace/VB\n (NP job...
4 (S\n (NP ability/NN)\n analyze/VBP\n (NP bi...
...
119 (S\n (NP bitcoin/NN)\n provide/VBP\n (NP ea...
120 (S\n (NP ethereum/NN)\n (NP smart/NN)\n (NP...
121 (S\n (NP ripple/NN)\n (NP fast/JJ internatio...
122 (S\n (NP litecoin/NN)\n (NP fast/JJ block/NN...
123 (S\n (NP chainlink/NN)\n bring/VBG\n (NP re...
Name: parsed_tree, Length: 124, dtype: object
# Initialize the vectorizer
vectorizer = TfidfVectorizer()
# Fit the vectorizer to the data
vectorizer.fit(data['text'])
# Get the feature names
feature_names = vectorizer.get_feature_names_out()
# Define a function to get the top n tf-idf features for a document
def top_tfidf_features(row, features, top_n=2):
    """Return the names of the *top_n* highest-tfidf features in *row*, descending."""
    best_ids = np.argsort(row)[::-1][:top_n]
    pairs = [(features[i], row[i]) for i in best_ids]
    frame = pd.DataFrame(pairs, columns=['feature', 'tfidf'])
    return frame['feature'].values
# Apply TF-IDF to the data
tfidf_matrix = vectorizer.transform(data['text']).toarray()
# Apply the function to get the top n features
data['text_summary'] = [" ".join(top_tfidf_features(row, feature_names)) for row in tfidf_matrix]
data['text_summary']
0 astounding transform
1 car trust
2 marketing predict
3 unemployment job
4 ability derive
...
119 volatile digital
120 flexibility write
121 put centralization
122 popularity overshadowed
123 early stage
Name: text_summary, Length: 124, dtype: object
# Encode sentiment labels ('Negative'/'Positive') to integers with
# sklearn's LabelEncoder; keep the encoder for decoding later.
le = preprocessing.LabelEncoder()
le.fit(data['sentiment'])
data['en_label'] = le.transform(data['sentiment'])
# Get input features from data
input_data = data['text']
# Get output labels from data
output_data = data['en_label']
# Split data into train and test using sklearn train-test split function with 80:20 ratio
X_train, X_test, y_train, y_test = train_test_split(
    input_data, output_data, test_size=0.20, random_state=2)
# Initialize tfidf vectorizer with maximum 500 features
vectorizer = TfidfVectorizer(max_features=500)
# Fit and transform on train data only (avoids leaking test vocabulary)
X_train = vectorizer.fit_transform(X_train)
# Transform vectorizer on test data
X_test = vectorizer.transform(X_test)
print("Train data input features shape: ", X_train.shape)
print("Test data input features shape: ", X_test.shape)
Train data input features shape: (99, 459) Test data input features shape: (25, 459)
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
print("Vocabulary size: ", len(vectorizer.get_feature_names_out()))
Vocabulary size: 459
def train_evalute_model(model, model_name, X_train, X_test, y_train, y_test):
    """Fit *model* on the training split and score it on the test split.

    Returns a dict holding the model name, accuracy, macro-averaged
    precision/recall/F1, and the confusion matrix (all via sklearn
    metrics). Also prints the wall-clock time taken.
    """
    started = time.time()
    # XGBoost and some sklearn models want dense input, so densify once.
    model.fit(X_train.toarray(), y_train)
    predictions = model.predict(X_test.toarray())
    result = {
        'Model name': model_name,
        'Accuracy': accuracy_score(y_test, predictions),
        'Precision': precision_score(y_test, predictions, average='macro'),
        'Recall': recall_score(y_test, predictions, average='macro'),
        'F1': f1_score(y_test, predictions, average='macro'),
        'Confusion matrix': confusion_matrix(y_test, predictions),
    }
    elapsed = np.round((time.time() - started), 3)
    print(model_name, "Model take --- %s seconds ---" % elapsed)
    return result
# Train and evaluate each classifier on the same TF-IDF split; collect the
# metric dicts for later tabulation.
results = []
# Logistic Regression
results.append(train_evalute_model(LogisticRegression(
    random_state=0, solver='lbfgs'), 'LR', X_train, X_test, y_train, y_test))
# Random Forest Classifier
results.append(train_evalute_model(RandomForestClassifier(
    n_estimators=100, random_state=0), 'RF', X_train, X_test, y_train, y_test))
# Multinomial NB
results.append(train_evalute_model(MultinomialNB(),
    'NB', X_train, X_test, y_train, y_test))
# XGB Classifier
results.append(train_evalute_model(xgb.XGBClassifier(
    objective="binary:logistic", random_state=42), 'XGB', X_train, X_test, y_train, y_test))
# Decision Tree Classifier
results.append(train_evalute_model(DecisionTreeClassifier(
    random_state=0), 'DT', X_train, X_test, y_train, y_test))
LR Model take --- 0.065 seconds --- RF Model take --- 0.53 seconds --- NB Model take --- 0.03 seconds --- XGB Model take --- 0.364 seconds --- DT Model take --- 0.025 seconds ---
# Save results as a csv file
results = pd.DataFrame(results)
# Display results
results.iloc[:, :4]
| Model name | Accuracy | Precision | Recall | |
|---|---|---|---|---|
| 0 | LR | 0.88 | 0.440000 | 0.500000 |
| 1 | RF | 0.88 | 0.440000 | 0.500000 |
| 2 | NB | 0.88 | 0.440000 | 0.500000 |
| 3 | XGB | 0.92 | 0.958333 | 0.666667 |
| 4 | DT | 0.88 | 0.706522 | 0.643939 |
def plot_cm_matrix(data):
    """Plot the confusion matrix of every model in *data*.

    Args:
        data: DataFrame with 'Confusion matrix' and 'Model name' columns.

    Bug fix: the original ignored its parameter and iterated the global
    ``cm_results`` with fragile positional indexing (``i[1][1]``); it now
    uses the argument and named column access.
    """
    for _, row in data.iterrows():
        print("\t ", row['Model name'], "Model confusion matrix plot")
        print("\t+====================================+")
        disp = ConfusionMatrixDisplay(row['Confusion matrix'], display_labels=le.classes_)
        fig, ax = plt.subplots(figsize=(5, 5))
        disp.plot(ax=ax)
        plt.title("Confusion Matrix")
        plt.show()
cm_results = results[['Confusion matrix', 'Model name']]
plot_cm_matrix(cm_results)
LR Model confusion matrix plot +====================================+
RF Model confusion matrix plot +====================================+
NB Model confusion matrix plot +====================================+
XGB Model confusion matrix plot +====================================+
DT Model confusion matrix plot +====================================+
# Unigram+bigram counts, dropping terms in >95% of docs or fewer than 2 docs.
n_grams = CountVectorizer(max_df=0.95, min_df=2, ngram_range = (1,2))
n_grams_vec = n_grams.fit_transform(data['text'])
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
n_grams_feature_names = n_grams.get_feature_names_out()
def cluster_model(data, features, number_of_clusters):
    """Cluster *features* with KMeans and return labeled texts plus sorted centroids.

    Args:
        data: DataFrame with a 'text' column.
        features: document-term matrix to cluster.
        number_of_clusters: k for KMeans.

    Returns:
        (new_data, value_centroids, index_centroids): texts with a 'label'
        column, per-cluster centroid weights sorted descending, and the
        feature indices producing that order.

    Bug fix: the original called ``km.fit`` and then ``km.fit_predict``,
    fitting the model twice — wasteful and, with random initialization,
    the second fit's labels could disagree with the first fit's centroids.
    Fit once and reuse the labels.
    """
    km = KMeans(n_clusters=number_of_clusters)
    labels = km.fit_predict(features)
    new_data = pd.DataFrame(data['text'])
    new_data['label'] = labels
    # Per-cluster centroid weights in descending order, plus the feature
    # indices that produce that order.
    value_centroids = np.sort(km.cluster_centers_)[:, ::-1]
    index_centroids = km.cluster_centers_.argsort()[:, ::-1]
    return new_data, value_centroids, index_centroids
number_of_clusters = 5
new_data, value_centroids, index_centroids = cluster_model(data, n_grams_vec, number_of_clusters)
np.unique(new_data['label'], return_counts=True)
(array([0, 1, 2, 3, 4]), array([ 2, 15, 6, 2, 99], dtype=int64))
words_num = None  # None slice bound -> keep every word in the frequency dict
for i in range(number_of_clusters):
    print("\n\t\t\tCluster",i+1)
    print("+=========================================================+")
    # rows assigned to this cluster
    copy = new_data.loc[new_data['label'] == i]
    # tally word frequencies across the cluster's documents
    results_total = Counter()
    copy['text'].str.split().apply(results_total.update)
    plot_words = dict(sorted(results_total.items(), key=lambda item: item[1],reverse = True)[:words_num])
    # Generate a word cloud image from the frequencies
    wordcloud = WordCloud(background_color="black").generate_from_frequencies(plot_words)
    # define plotting figure size
    plt.figure(figsize=(5,8))
    # Display the generated image:
    # the matplotlib way:
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
    # clear the current figure between clusters
    plt.clf()
Cluster 1 +=========================================================+
Cluster 2 +=========================================================+
<Figure size 640x480 with 0 Axes>
Cluster 3 +=========================================================+
<Figure size 640x480 with 0 Axes>
Cluster 4 +=========================================================+
<Figure size 640x480 with 0 Axes>
Cluster 5 +=========================================================+
<Figure size 640x480 with 0 Axes>
<Figure size 640x480 with 0 Axes>
from sklearn.metrics.pairwise import cosine_similarity
# Initialize the vectorizer (note: fit on the RAW 'conversation' column,
# not the cleaned 'text' column)
vectorizer = TfidfVectorizer()
# Fit the vectorizer to the data and transform the 'conversation' column
tfidf_vectors = vectorizer.fit_transform(data['conversation'])
# Convert the tfidf_vectors to an array and store in the DataFrame
data['tfidf_vector'] = tfidf_vectors.toarray().tolist()
# Pairwise doc-to-doc cosine similarity
csim = cosine_similarity(tfidf_vectors)
# Export the doc x doc similarity matrix to Excel
similarity_data = pd.DataFrame(csim)
similarity_data.columns = ["doc_"+str(i) for i in range(similarity_data.shape[0])]
similarity_data.to_excel("cs.xlsx")
nlp = spacy.load("en_core_web_sm")
data['noun_phrases'] = data['text'].apply(lambda text: list(nlp(text).noun_chunks))
data['noun_phrases']
0 [(capability, astounding), (industry)]
1 [(autonomous, car, scare, trust, machine, drive)]
2 [(algorithm), (customer, behavior), (marketing)]
3 [(concern), (unemployment)]
4 [(ability), (big, datum, derive, insight, mind...
...
119 [(bitcoin), (easy, entry, world, digital, curr...
120 [(ethereum, smart, contract), (flexibility, co...
121 [(ripple, fast, international, transfer, comme...
122 [(fast, block, generation, impressive, oversha...
123 [(chainlink), (real, world, datum, blockchain,...
Name: noun_phrases, Length: 124, dtype: object
def parse_dependencies(text):
    """Return (token, dependency-relation, head-token) triples for *text*."""
    parsed = nlp(text)
    return [(tok.text, tok.dep_, tok.head.text) for tok in parsed]
data['parsed_dependencies'] = data['text'].apply(parse_dependencies)
data['parsed_dependencies']
0 [(capability, compound, astounding), (astoundi...
1 [(autonomous, amod, drive), (car, compound, sc...
2 [(algorithm, nsubj, predict), (predict, ROOT, ...
3 [(concern, nsubj, replace), (replace, compound...
4 [(ability, nsubj, analyze), (analyze, ROOT, an...
...
119 [(bitcoin, nsubj, provide), (provide, ROOT, pr...
120 [(ethereum, nmod, contract), (smart, amod, con...
121 [(ripple, amod, aspect), (fast, amod, aspect),...
122 [(litecoin, ROOT, litecoin), (fast, amod, bloc...
123 [(chainlink, nsubj, bring), (bring, ROOT, brin...
Name: parsed_dependencies, Length: 124, dtype: object
from spacy import displacy
from ipywidgets import interact
nlp = spacy.load("en_core_web_sm")
def plot_dependency(index):
    """Render the dependency parse of the document at *index* inline in the notebook."""
    text = data['text'].iloc[index]
    doc = nlp(text)
    return displacy.render(doc, style="dep", jupyter=True)
# ipywidgets slider over the DataFrame index drives the rendering
interact(plot_dependency, index=(0, len(data)-1))
#pip install jupyter_dash
import dash
from dash import dcc, html
import plotly.graph_objects as go
import pandas as pd
import numpy as np
# NOTE(review): this OVERWRITES the real cosine-similarity matrix computed
# earlier with random data — presumably a demo placeholder; confirm.
similarity_data = np.random.rand(124, 124)
app = dash.Dash(__name__)
# Layout: a slider choosing how many documents to show, plus the heatmap.
app.layout = html.Div([
    dcc.Slider(
        id='num-docs-slider',
        min=1,
        max=124,
        step=1,
        value=124,
    ),
    dcc.Graph(id='heatmap-graph')
])
@app.callback(
    dash.dependencies.Output('heatmap-graph', 'figure'),
    [dash.dependencies.Input('num-docs-slider', 'value')])
def update_graph(num_docs):
    """Rebuild the similarity heatmap for the first *num_docs* documents."""
    figure = go.Figure(data=go.Heatmap(
        z=similarity_data[:num_docs, :num_docs],
        x=list(range(1, num_docs + 1)),
        y=list(range(1, num_docs + 1)),
        text=np.around(similarity_data[:num_docs, :num_docs], decimals=2), # show values in each cell
        hoverinfo='text', # show text when hovering
        hoverongaps=False))
    figure.update_layout(
        title=f'Heatmap of first {num_docs} docs',
        xaxis_nticks=40,
        yaxis_nticks=40,
        autosize=False,
        width=800,
        height=800,
        margin=dict(
            l=50,
            r=50,
            b=100,
            t=100,
            pad=4
        ))
    return figure
if __name__ == '__main__':
    # debug=True enables hot reload; disable for production use.
    app.run_server(debug=True)